# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence all warnings so library deprecation notices do not clutter the output
warnings.filterwarnings(action="ignore")
# Read the dataset from the local CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="breast_cancer_wisconsin_data.csv")
# Preview the first five rows
df.head(n=5)
| id | diagnosis | radius_mean | texture_mean | perimeter_mean | area_mean | smoothness_mean | compactness_mean | concavity_mean | points_mean | ... | radius_worst | texture_worst | perimeter_worst | area_worst | smoothness_worst | compactness_worst | concavity_worst | points_worst | symmetry_worst | dimension_worst | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 87139402 | B | 12.32 | 12.39 | 78.85 | 464.1 | 0.10280 | 0.06981 | 0.03987 | 0.03700 | ... | 13.50 | 15.64 | 86.97 | 549.1 | 0.1385 | 0.1266 | 0.12420 | 0.09391 | 0.2827 | 0.06771 |
| 1 | 8910251 | B | 10.60 | 18.95 | 69.28 | 346.4 | 0.09688 | 0.11470 | 0.06387 | 0.02642 | ... | 11.88 | 22.94 | 78.28 | 424.8 | 0.1213 | 0.2515 | 0.19160 | 0.07926 | 0.2940 | 0.07587 |
| 2 | 905520 | B | 11.04 | 16.83 | 70.92 | 373.2 | 0.10770 | 0.07804 | 0.03046 | 0.02480 | ... | 12.41 | 26.44 | 79.93 | 471.4 | 0.1369 | 0.1482 | 0.10670 | 0.07431 | 0.2998 | 0.07881 |
| 3 | 868871 | B | 11.28 | 13.39 | 73.00 | 384.8 | 0.11640 | 0.11360 | 0.04635 | 0.04796 | ... | 11.92 | 15.77 | 76.53 | 434.0 | 0.1367 | 0.1822 | 0.08669 | 0.08611 | 0.2102 | 0.06784 |
| 4 | 9012568 | B | 15.19 | 13.21 | 97.65 | 711.8 | 0.07963 | 0.06934 | 0.03393 | 0.02657 | ... | 16.20 | 15.73 | 104.50 | 819.1 | 0.1126 | 0.1737 | 0.13620 | 0.08178 | 0.2487 | 0.06766 |
5 rows × 32 columns
# Report the dataset dimensions as a (rows, columns) tuple
df.shape
(569, 32)
# List every column name in the dataset
print(list(df.columns))
['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', 'points_mean', 'symmetry_mean', 'dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', 'compactness_se', 'concavity_se', 'points_se', 'symmetry_se', 'dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'points_worst', 'symmetry_worst', 'dimension_worst']
# Print a per-column summary: non-null counts, dtypes, and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 569 entries, 0 to 568 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 569 non-null int64 1 diagnosis 569 non-null object 2 radius_mean 569 non-null float64 3 texture_mean 569 non-null float64 4 perimeter_mean 569 non-null float64 5 area_mean 569 non-null float64 6 smoothness_mean 569 non-null float64 7 compactness_mean 569 non-null float64 8 concavity_mean 569 non-null float64 9 points_mean 569 non-null float64 10 symmetry_mean 569 non-null float64 11 dimension_mean 569 non-null float64 12 radius_se 569 non-null float64 13 texture_se 569 non-null float64 14 perimeter_se 569 non-null float64 15 area_se 569 non-null float64 16 smoothness_se 569 non-null float64 17 compactness_se 569 non-null float64 18 concavity_se 569 non-null float64 19 points_se 569 non-null float64 20 symmetry_se 569 non-null float64 21 dimension_se 569 non-null float64 22 radius_worst 569 non-null float64 23 texture_worst 569 non-null float64 24 perimeter_worst 569 non-null float64 25 area_worst 569 non-null float64 26 smoothness_worst 569 non-null float64 27 compactness_worst 569 non-null float64 28 concavity_worst 569 non-null float64 29 points_worst 569 non-null float64 30 symmetry_worst 569 non-null float64 31 dimension_worst 569 non-null float64 dtypes: float64(30), int64(1), object(1) memory usage: 142.4+ KB
# Count the missing values in each column
df.isna().sum()
id 0 diagnosis 0 radius_mean 0 texture_mean 0 perimeter_mean 0 area_mean 0 smoothness_mean 0 compactness_mean 0 concavity_mean 0 points_mean 0 symmetry_mean 0 dimension_mean 0 radius_se 0 texture_se 0 perimeter_se 0 area_se 0 smoothness_se 0 compactness_se 0 concavity_se 0 points_se 0 symmetry_se 0 dimension_se 0 radius_worst 0 texture_worst 0 perimeter_worst 0 area_worst 0 smoothness_worst 0 compactness_worst 0 concavity_worst 0 points_worst 0 symmetry_worst 0 dimension_worst 0 dtype: int64
# Class balance of the target column: how many rows carry each diagnosis label
print(df['diagnosis'].value_counts())
sns.countplot(x='diagnosis', data=df)
# Render the figure. plt.plot() with no arguments draws nothing and just
# returns an empty list, so plt.show() is the call that displays the chart.
plt.show()
B 357 M 212 Name: diagnosis, dtype: int64
[]
# Pairwise relationships between the "*_mean" features, colored by diagnosis.
# The grid is limited to the mean features (which also leaves out the `id`
# identifier column): plotting every numeric column at once yields a
# 31 x 31 grid that is very slow to render and too dense to read.
mean_features = [col for col in df.columns if col.endswith('_mean')]
sns.pairplot(df, vars=mean_features, hue='diagnosis', diag_kind='kde')
plt.show()
<seaborn.axisgrid.PairGrid at 0x19daf9920d0>